#-----------------------------------------------------------------------------#
# Pay it forward (digitally) 
# Data analysis
# Jeff Allen
# Last updated: June 12, 2021
#-----------------------------------------------------------------------------#

library(tidyverse)
library(sandwich)
library(lmtest)
library(margins)
library(gridExtra)
library(reshape2)
library(stargazer)
library(survey)
library(lmtest)
library(sampleSelection)
library(pscl)
library(dotwhisker)

## Turn off scientific notation
options(scipen = 999)

# Import and Formatting ---------------------------------------------------

micro.df <- read.csv("micro_dp_2017.csv", stringsAsFactors = FALSE)
micro.14 <- read.csv("micro_dp_2014.csv", stringsAsFactors = FALSE)

summary(micro.df)
summary(micro.14)

## Transform vars
micro.df$educ <- factor(micro.df$educ)
micro.14$educ <- factor(micro.14$educ)

micro.df$inc_q <- factor(micro.df$inc_q)
micro.14$inc_q <- factor(micro.14$inc_q)

micro.df$age_g <- factor(micro.df$age_g)
micro.14$age_g <- factor(micro.14$age_g)

## Select modeling vars 
df <- 
  select(micro.df, 
         economycode, regionwb, wpid_random, wgt, # ID 
         mdp, ewage, instrument, # DV, IV, NC
         account, # Key OV
         female, age_g, educ, inc_q, inlf, # Demos
         saved # FR
         )

df.14 <- 
  select(micro.14, 
         economycode, regionwb, wpid_random, wgt, # ID 
         mdp, ewage, instrument, # DV, IV, NC
         account, # Key OV
         female, age_g, educ, inc_q, inlf, # Demos
         saved # FR
  )

# Sample Characteristics --------------------------------------------------

## Raw
summary(df)
prop.table(table(df$age_g))
prop.table(table(df$inc_q))
prop.table(table(df$educ))

## Survey

## Specify survey design
df.w <- svydesign(ids=~1, data=df, weights=df$wgt)

## Weighted sample
svymean(~mdp + instrument + ewage + saved + 
          age_g + educ + 
          inlf + inc_q + female, 
        df.w, na.rm = TRUE)

## How many countries per region?
regions.df <- unique(df[1:2])
aggregate(economycode ~ regionwb, data = regions.df, length)
aggregate(wpid_random ~ regionwb, data = df, length)

# Model -------------------------------------------------------------------

## Listwise deletion (models will do this anyway)
df <- na.omit(df)
df.14 <- na.omit(df.14)

# First, probit selection equation
probit.1 <- 
  glm(instrument ~ 
        age_g + educ + female + 
        inlf + inc_q + 
        saved,
      family = binomial(link = "probit"), weights = df$wgt, data = df)

probit.2 <- 
  glm(instrument ~ 
        age_g + educ + female + 
        inlf + inc_q + 
        saved,
      family = binomial(link = "probit"), weights = df.14$wgt, data = df.14)

## Store IMR
df$IMR <- invMillsRatio(probit.1)$IMR1
df.14$IMR <- invMillsRatio(probit.2)$IMR1

## Outcome equation with IMR
logit.1 <-
  glm(mdp ~ 
        ewage + 
        age_g + educ + female + 
        inc_q + inlf + 
        IMR,
      data = df[df$instrument == 1,], 
      weights = df[df$instrument == 1,]$wgt, 
      family = binomial(link = "logit"))

logit.2 <-
  glm(mdp ~ 
        ewage + 
        age_g + educ + female + 
        inc_q + inlf + 
        IMR,
      data = df.14[df.14$instrument == 1,], 
      weights = df.14[df.14$instrument == 1,]$wgt, 
      family = binomial(link = "logit"))

## Summarize
summary(logit.1)
summary(logit.2)

## Cluster robust Standard Errors
logit.1.vcov <- vcovCL(logit.1, cluster = ~economycode)
logit.2.vcov <- vcovCL(logit.2, cluster = ~economycode)

## Store cse
logit.1.cse <- sqrt(diag(logit.1.vcov))
logit.2.cse <- sqrt(diag(logit.2.vcov))

## Summarize with CSE
(model.1 <- coeftest(logit.1, vcov = logit.1.vcov))
(model.2 <- coeftest(logit.2, vcov = logit.2.vcov))

## Margins
m.1 <- margins(logit.1, vcov = logit.1.vcov)
m.2 <- margins(logit.2, vcov = logit.2.vcov)

# Dot and Whisker ---------------------------------------------------------

## Try with package
m1.dw <- summary(m.1)
m2.dw <- summary(m.2)

## Remove IMR (we'll display in statistical appendix)
m1.dw <- m1.dw[m1.dw$factor != "IMR",]
m2.dw <- m2.dw[m2.dw$factor != "IMR",]

## Store new var names
var.names <- c("Age: 30-44", "Age: 45-59", "Age: 60 & Up",
               "Education: secondary", "Education: tertiary",
               "Electronic wages",
               "Female", 
               "Income: quintile 2", "Income: quintile 3", 
               "Income: quintile 4", "Income: quintile 5", 
               "Employed"
)

## Rename vars
m1.dw$factor <- var.names
m2.dw$factor <- var.names

## Organize summary DFs
m1.dw <- select(m1.dw, factor, AME, SE)
m2.dw <- select(m2.dw, factor, AME, SE)

## Rename columns
colnames(m1.dw) <- c("term", "estimate", "std.error")
colnames(m2.dw) <- c("term", "estimate", "std.error")

## Add model identifier
m1.dw$model <- 1
m2.dw$model <- 0

## Bind
m.dw <- rbind(m2.dw, m1.dw)

## Re-arrange
m.dw <- select(m.dw, model, term, estimate, std.error)

## Re-order factor
m.dw$term <- 
  factor(m.dw$term,
         levels = c(
           "Electronic wages",
           "Age: 30-44",
           "Age: 45-59",
           "Age: 60 & Up",
           "Education: secondary",
           "Education: tertiary",
           "Female",
           "Employed",
           "Income: quintile 2",
           "Income: quintile 3",
           "Income: quintile 4",
           "Income: quintile 5"
         ))

## 800 X 462 (depends on screen)
dwplot(m.dw,
       vline = geom_vline(xintercept = 0, linetype = 2), # plot line at zero _behind_ coefs
  dot_args = list(aes(shape = model), size = 2.2),
  whisker_args = list(size = 0.6)) +
  xlab("") + ylab("") +
  scale_colour_grey(start = .1, end = .1, # if start and end same value, use same colour for all models
                    name = "Year",
                    breaks = c(0, 1),
                    labels = c("2014", "2017"),
                    guide = guide_legend(reverse = TRUE)) +
  scale_shape_discrete(name = "Year",
                       breaks = c(0, 1),
                       labels = c("2014", "2017"),
                       guide = guide_legend(reverse = TRUE)) + 
  labs(
    title = "Marginal effects on digital payment propensity",
    subtitle = "Bands represent 95% confidence intervals",
    caption = 
      "Categorical variable baselines: Age (15-29); Education (primary); Income (quintile 1)",
    x = "", y = "")

# Regional ----------------------------------------------------------------

## Create new DFs
eap.df <- df[df$regionwb == "East Asia & Pacific (excluding high income)",]
eca.df <- df[df$regionwb == "Europe & Central Asia (excluding high income)",]
lac.df <- df[df$regionwb == "Latin America & Caribbean (excluding high income)",]
mena.df <- df[df$regionwb == "Middle East & North Africa (excluding high income)",]
noecd.df <- df[df$regionwb == "High income: nonOECD",]
oecd.df <- df[df$regionwb == "High income: OECD",]
sas.df <- df[df$regionwb == "South Asia",]
ssa.df <- df[df$regionwb == "Sub-Saharan Africa (excluding high income)",]

## Probit selection
probit.eap <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = eap.df, weights = eap.df$wgt)

probit.eca <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = eca.df, weights = eca.df$wgt)

probit.lac <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = lac.df, weights = lac.df$wgt)

probit.mena <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = mena.df, weights = mena.df$wgt)

probit.noecd <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = noecd.df, weights = noecd.df$wgt)

probit.oecd <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = oecd.df, weights = oecd.df$wgt)

probit.sas <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = sas.df, weights = sas.df$wgt)

probit.ssa <- 
  glm(instrument ~ age_g + educ + inlf + female + inc_q + saved, 
      family = binomial(link = "probit"), 
      data = ssa.df, weights = ssa.df$wgt)

## IMRs
eap.df$IMR <- invMillsRatio(probit.eap)$IMR1
eca.df$IMR <- invMillsRatio(probit.eca)$IMR1
lac.df$IMR <- invMillsRatio(probit.lac)$IMR1
mena.df$IMR <- invMillsRatio(probit.mena)$IMR1
noecd.df$IMR <- invMillsRatio(probit.noecd)$IMR1
oecd.df$IMR <- invMillsRatio(probit.oecd)$IMR1
sas.df$IMR <- invMillsRatio(probit.sas)$IMR1
ssa.df$IMR <- invMillsRatio(probit.ssa)$IMR1

## Logit models
logit.eap <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = eap.df[eap.df$instrument == 1,], 
      weights = eap.df[eap.df$instrument == 1,]$wgt, 
      family = binomial(link = "logit"))

logit.eca <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = eca.df[eca.df$instrument == 1,], 
      weights = eca.df[eca.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

logit.lac <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = lac.df[lac.df$instrument == 1,], 
      weights = lac.df[lac.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

logit.mena <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = mena.df[mena.df$instrument == 1,], 
      weights = mena.df[mena.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

logit.noecd <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = noecd.df[noecd.df$instrument == 1,], 
      weights = noecd.df[noecd.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

logit.oecd <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = oecd.df[oecd.df$instrument == 1,], 
      weights = oecd.df[oecd.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

logit.sas <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = sas.df[sas.df$instrument == 1,], 
      weights = sas.df[sas.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

logit.ssa <-
  glm(mdp ~ ewage + age_g + educ + female + inc_q + inlf + IMR,
      data = ssa.df[ssa.df$instrument == 1,], 
      weights = ssa.df[ssa.df$instrument == 1,]$wgt,
      family = binomial(link = "logit"))

## Summarize
summary(logit.eap)
summary(logit.eca)
summary(logit.lac)
summary(logit.mena)
summary(logit.sas)
summary(logit.ssa)
summary(logit.oecd)
summary(logit.noecd)

## Regional differences are substantial
## Education effects mostly driven by high income countries

## Cluster robust Standard Errors
logit.eap.vcov <- vcovCL(logit.eap, cluster = ~economycode)
logit.eca.vcov <- vcovCL(logit.eca, cluster = ~economycode)
logit.lac.vcov <- vcovCL(logit.lac, cluster = ~economycode)
logit.mena.vcov <- vcovCL(logit.mena, cluster = ~economycode)
logit.noecd.vcov <- vcovCL(logit.noecd, cluster = ~economycode)
logit.oecd.vcov <- vcovCL(logit.oecd, cluster = ~economycode)
logit.sas.vcov <- vcovCL(logit.sas, cluster = ~economycode)
logit.ssa.vcov <- vcovCL(logit.ssa, cluster = ~economycode)

## Store cse
logit.eap.cse <- sqrt(diag(logit.eap.vcov))
logit.eca.cse <- sqrt(diag(logit.eca.vcov))
logit.lac.cse <- sqrt(diag(logit.lac.vcov))
logit.mena.cse <- sqrt(diag(logit.mena.vcov))
logit.noecd.cse <- sqrt(diag(logit.noecd.vcov))
logit.oecd.cse <- sqrt(diag(logit.oecd.vcov))
logit.sas.cse <- sqrt(diag(logit.sas.vcov))
logit.ssa.cse <- sqrt(diag(logit.ssa.vcov))

## Summarize with CSE
(model.eap <- coeftest(logit.eap, vcov = logit.eap.vcov))
(model.eca <- coeftest(logit.eca, vcov = logit.eca.vcov))
(model.lac <- coeftest(logit.lac, vcov = logit.lac.vcov))
(model.mena <- coeftest(logit.mena, vcov = logit.mena.vcov))
(model.noecd <- coeftest(logit.noecd, vcov = logit.noecd.vcov))
(model.oecd <- coeftest(logit.oecd, vcov = logit.oecd.vcov))
(model.sas <- coeftest(logit.sas, vcov = logit.sas.vcov))
(model.ssa <- coeftest(logit.ssa, vcov = logit.ssa.vcov))

## Margins
m.eap <- margins(logit.eap, vcov = logit.eap.vcov)
m.eca <- margins(logit.eca, vcov = logit.eca.vcov)
m.lac <- margins(logit.lac, vcov = logit.lac.vcov)
m.mena <- margins(logit.mena, vcov = logit.mena.vcov)
m.noecd <- margins(logit.noecd, vcov = logit.noecd.vcov)
m.oecd <- margins(logit.oecd, vcov = logit.oecd.vcov)
m.sas <- margins(logit.sas, vcov = logit.sas.vcov)
m.ssa <- margins(logit.ssa, vcov = logit.ssa.vcov)

## Summary
m.eap.df <- summary(m.eap)
m.eca.df <- summary(m.eca)
m.lac.df <- summary(m.lac)
m.mena.df <- summary(m.mena)
m.noecd.df <- summary(m.noecd)
m.oecd.df <- summary(m.oecd)
m.sas.df <- summary(m.sas)
m.ssa.df <- summary(m.ssa)

## Remove IMR
m.eap.df <- m.eap.df[m.eap.df$factor != "IMR",]
m.eca.df <- m.eca.df[m.eca.df$factor != "IMR",]
m.lac.df <- m.lac.df[m.lac.df$factor != "IMR",]
m.mena.df <- m.mena.df[m.mena.df$factor != "IMR",]
m.noecd.df <- m.noecd.df[m.noecd.df$factor != "IMR",]
m.oecd.df <- m.oecd.df[m.oecd.df$factor != "IMR",]
m.sas.df <- m.sas.df[m.sas.df$factor != "IMR",]
m.ssa.df <- m.ssa.df[m.ssa.df$factor != "IMR",]

## Rename
m.eap.df$factor <- var.names
m.eca.df$factor <- var.names
m.lac.df$factor <- var.names
m.mena.df$factor <- var.names
m.noecd.df$factor <- var.names
m.oecd.df$factor <- var.names
m.sas.df$factor <- var.names
m.ssa.df$factor <- var.names

## Organize summary DFs
m.eap.df <- select(m.eap.df, factor, AME, lower, upper, everything())
m.eca.df <- select(m.eca.df, factor, AME, lower, upper, everything())
m.lac.df <- select(m.lac.df, factor, AME, lower, upper, everything())
m.mena.df <- select(m.mena.df, factor, AME, lower, upper, everything())
m.noecd.df <- select(m.noecd.df, factor, AME, lower, upper, everything())
m.oecd.df <- select(m.oecd.df, factor, AME, lower, upper, everything())
m.sas.df <- select(m.sas.df, factor, AME, lower, upper, everything())
m.ssa.df <- select(m.ssa.df, factor, AME, lower, upper, everything())

## Melt
m.eap.df <- melt(m.eap.df[1:4])
m.eca.df <- melt(m.eca.df[1:4])
m.lac.df <- melt(m.lac.df[1:4])
m.mena.df <- melt(m.mena.df[1:4])
m.noecd.df <- melt(m.noecd.df[1:4])
m.oecd.df <- melt(m.oecd.df[1:4])
m.sas.df <- melt(m.sas.df[1:4])
m.ssa.df <- melt(m.ssa.df[1:4])

## Add identifier
m.eap.df$region <- "East Asia & Pacific"
m.eca.df$region <- "Europe & Central Asia"
m.noecd.df$region <- "High income: non-OECD"
m.oecd.df$region <- "High income: OECD"
m.lac.df$region <- "Latin America & Caribbean"
m.mena.df$region <- "Middle East & North Africa"
m.sas.df$region <- "South Asia"
m.ssa.df$region <- "Sub-Saharan Africa"

## Bind
region.df <- rbind(
  m.eap.df, m.eca.df, m.noecd.df, m.oecd.df, m.lac.df, m.mena.df, m.sas.df,
  m.ssa.df
)

## Re-order factor
region.df$factor <- 
  factor(region.df$factor,
         levels = c(
           "Income: quintile 5",
           "Income: quintile 4",
           "Income: quintile 3",
           "Income: quintile 2",
           "Employed",
           "Female",
           "Education: tertiary",
           "Education: secondary",
           "Age: 60 & Up",
           "Age: 45-59",
           "Age: 30-44",
           "Electronic wages"
         ))

## Plot (1100 X 725) 
ggplot(data = region.df) + 
  stat_summary(
    mapping = aes(x = factor, y = value),
    fun.min = min, fun.max = max, fun = median
  ) +
  facet_wrap(~ region) + 
  geom_hline(yintercept = 0, linetype = 2) + 
  coord_flip(ylim = c(-0.4, 0.4)) + 
  labs(
    title = "Marginal effects on digital payment propensity (2017)",
    subtitle = "Bands represent 95% confidence intervals",
    caption = "Unless otherwise noted, groups exclude high-income countries.",
    x = "", y = "")

# Appendix ----------------------------------------------------------------

## Probit selection equations

probit.names <- c("Age: 30-44", "Age: 45-59", "Age: 60 and Up",
                  "Education: secondary", "Education: tertiary",
                  "Female", "Employed",
                  "Income: quintile 2", "Income: quintile 3", 
                  "Income: quintile 4", "Income: quintile 5", 
                  "Saved")

stargazer(probit.1, probit.2, 
          out = "probit.html",
          title = "Probit Selection Equations",
          no.space = TRUE,
          dep.var.labels = "E-Instrument",
          covariate.labels = probit.names,
          align = TRUE)

## Adoption: Global

## Save margins
m.2017 <- summary(m.1)
m.2014 <- summary(m.2)

## Re-order
m.2017 <- m.2017[c(6, 1:5, 7, 9:13, 8),]
m.2014 <- m.2014[c(6, 1:5, 7, 9:13, 8),]

## Store coefficients
c.2017 <- c(m.2017$AME, logit.1$coefficients[1])
c.2014 <- c(m.2014$AME, logit.2$coefficients[1])

## Store SEs
se.2017 <- c(m.2017$SE, logit.1.cse[1])
se.2014 <- c(m.2014$SE, logit.2.cse[1])

## Rename SEs
names(se.2017) <- names(c.2017)
names(se.2014) <- names(c.2014)

## Store variable names
logit.names <- c("Electronic wages",
                 "Age: 30-44", "Age: 45-59", "Age: 60 and Up",
                 "Education: secondary", "Education: tertiary",
                 "Female", 
                 "Income: quintile 2", "Income: quintile 3", 
                 "Income: quintile 4", "Income: quintile 5", 
                 "Employed", "IMR"
)

stargazer(logit.1, logit.2, 
          coef = list(c.2017, c.2014), 
          se = list(se.2017, se.2014),
          out = "logit.html",
          title = "Main models",
          no.space = TRUE,
          covariate.labels = logit.names,
          dep.var.labels = "Made digital payment",
          align = TRUE)

## Regions

## Save margins
m.eap.2 <- summary(m.eap)
m.eca.2 <- summary(m.eca)
m.lac.2 <- summary(m.lac)
m.mena.2 <- summary(m.mena)
m.noecd.2 <- summary(m.noecd)
m.oecd.2 <- summary(m.oecd)
m.sas.2 <- summary(m.sas)
m.ssa.2 <- summary(m.ssa)

## Re-order
m.eap.2 <- m.eap.2[c(6, 1:5, 7, 9:13, 8),]
m.eca.2 <- m.eca.2[c(6, 1:5, 7, 9:13, 8),]
m.lac.2 <- m.lac.2[c(6, 1:5, 7, 9:13, 8),]
m.mena.2 <- m.mena.2[c(6, 1:5, 7, 9:13, 8),]
m.noecd.2 <- m.noecd.2[c(6, 1:5, 7, 9:13, 8),]
m.oecd.2 <- m.oecd.2[c(6, 1:5, 7, 9:13, 8),]
m.sas.2 <- m.sas.2[c(6, 1:5, 7, 9:13, 8),]
m.ssa.2 <- m.ssa.2[c(6, 1:5, 7, 9:13, 8),]

## Store coefficients
c.eap <- c(m.eap.2$AME, logit.eap$coefficients[1])
c.eca <- c(m.eca.2$AME, logit.eca$coefficients[1])
c.lac <- c(m.lac.2$AME, logit.lac$coefficients[1])
c.mena <- c(m.mena.2$AME, logit.mena$coefficients[1])
c.noecd <- c(m.noecd.2$AME, logit.noecd$coefficients[1])
c.oecd <- c(m.oecd.2$AME, logit.oecd$coefficients[1])
c.sas <- c(m.sas.2$AME, logit.sas$coefficients[1])
c.ssa <- c(m.ssa.2$AME, logit.ssa$coefficients[1])

## Store SEs
se.eap <- c(m.eap.2$SE, logit.eap.cse[1])
se.eca <- c(m.eca.2$SE, logit.eca.cse[1])
se.lac <- c(m.lac.2$SE, logit.lac.cse[1])
se.mena <- c(m.mena.2$SE, logit.mena.cse[1])
se.noecd <- c(m.noecd.2$SE, logit.noecd.cse[1])
se.oecd <- c(m.oecd.2$SE, logit.oecd.cse[1])
se.sas <- c(m.sas.2$SE, logit.sas.cse[1])
se.ssa <- c(m.ssa.2$SE, logit.ssa.cse[1])

## Rename SEs
names(se.eap) <- names(c.eap)
names(se.eca) <- names(c.eca)
names(se.lac) <- names(c.lac)
names(se.mena) <- names(c.mena)
names(se.noecd) <- names(c.noecd)
names(se.oecd) <- names(c.oecd)
names(se.sas) <- names(c.sas)
names(se.ssa) <- names(c.ssa)

## Stargazer
stargazer(logit.eap, logit.eca, logit.lac, logit.mena, 
          logit.noecd, logit.oecd, logit.sas, logit.ssa,
          coef = list(c.eap, c.eca, c.lac, c.mena, c.noecd,
                      c.oecd, c.sas, c.ssa), 
          se = list(se.eap, se.eca, se.lac, se.mena, se.noecd, 
                    se.oecd, se.sas, se.ssa),
          out = "regions.html",
          title = "Regional usage models",
          no.space = TRUE,
          covariate.labels = logit.names,
          dep.var.labels = "Made digital payment",
          align = TRUE)
